# ------------------------------------------------------------------------------# Canton S acclimation phenotype figure# March 31, 2023# TS O'Leary# ------------------------------------------------------------------------------# Description -----# Canton S phenotype figure# Load librarieslibrary(tidyverse)# Load datadat <-read_csv(here::here("data/raw/pheno/embryo_acc_pheno.csv"))# Filter only Canton S datadat <- dat |>filter(Genotype =="CantonS") |>filter(Stage =="Early")# Count the number of eggs per acclimation treatmentdat |>group_by(Acclimation) |>summarise(total_eggs =sum(Number_Eggs))# Acclimation effectdat |>glm(Survival ~ Acclimation, data = .,weights = Number_Eggs,family =binomial(link ="logit"))
00_mkref.sh
Code
#!/bin/bash# Make a reference package for D. melanogaster ---------------------------------# Directions for running this script -------------------------------------------# 0. Be sure the following things have happened before beginning# - Download FASTA and GTF files for D. melanogaster from Ensembl# - Create a contig file according to 10X Genomics instructions# 1. In the command line, run the following command: bash path/to/this/file.sh# Set up directories and variables ---------------------------------------------# Move to the directory where you want the output files to be savedcd /netfiles02/lockwood_lab/heater/data/processed/seq/ref/# Point towards installed cellranger-arc programexport PATH=/netfiles02/lockwood_lab/cellranger-arc/cellranger-arc-2.0.2:$PATH# # Set variables for the script# ref_path="/netfiles02/lockwood_lab/heater/data/processed/seq/ref/"# libraries_path="/netfiles02/lockwood_lab/heater/data/raw/seq/libraries/"# Create a raw unfiltered reference package ------------------------------------cellranger-arc mkref --config=BDGP6.32.config# Filter the GTF to restrict it to protein-coding, lncRNA, antisense, and # immune-related genes as 10X Genomics has done for their human and mouse refscellranger-arc mkgtf Drosophila_melanogaster.BDGP6.32.109.gtf \ Drosophila_melanogaster.BDGP6.32.109_filtered.gtf \--attribute=gene_biotype:protein_coding \--attribute=gene_biotype:lncRNA \--attribute=gene_biotype:antisense \--attribute=gene_biotype:IG_LV_gene \--attribute=gene_biotype:IG_V_gene \--attribute=gene_biotype:IG_V_pseudogene \--attribute=gene_biotype:IG_D_gene \--attribute=gene_biotype:IG_J_gene \--attribute=gene_biotype:IG_J_pseudogene \--attribute=gene_biotype:IG_C_gene \--attribute=gene_biotype:IG_C_pseudogene \--attribute=gene_biotype:TR_V_gene \--attribute=gene_biotype:TR_V_pseudogene \--attribute=gene_biotype:TR_D_gene \--attribute=gene_biotype:TR_J_gene \--attribute=gene_biotype:TR_J_pseudogene \--attribute=gene_biotype:TR_C_gene# Create a filtered reference package ------------------------------------------cellranger-arc mkref --config=BDGP6.32.filtered.config
01_count.sh
Code
#!/bin/bash# Directions for running this script -------------------------------------------# 0. Be sure the following things have happened before beginning# - Demultiplex all samples# - Create reference package# - Create a separate libraries csv file pointing to demuxed fastq files# - Set up slurm.template file according to 10X Genomics instructions in # cellranger-arc-2.0.2/external/martian/jobmanagers# 1. In the command line, run the following command: sbatch path/to/this/file.sh# Set up directories and variables ---------------------------------------------# Move to the directory where you want the output files to be savedcd /netfiles02/lockwood_lab/heater/data/processed/seq/samples/# Point towards installed cellranger-arc programexport PATH=/netfiles02/lockwood_lab/cellranger-arc/cellranger-arc-2.0.2:$PATH# Set variables for the scriptref_path="/netfiles02/lockwood_lab/heater/data/raw/seq/ref/BDGP6_32_filtered"libraries_path="/netfiles02/lockwood_lab/heater/data/raw/seq/libraries/"n_cores=32mem=64# Run cellranger-arc count for each sample -------------------------------------# Sample 18°C Rep 1cellranger-arc count --id=18C_Rep1 \--reference=${ref_path} \--libraries=${libraries_path}/18C_Rep1.csv \--localcores=${n_cores} \--localmem=${mem} \--jobmode=slurm# Sample 18°C Rep 2cellranger-arc count --id=18C_Rep2 \--reference=${ref_path} \--libraries=${libraries_path}/18C_Rep2.csv \--localcores=${n_cores} \--localmem=${mem} \--jobmode=slurm# Sample 25°C Rep 1cellranger-arc count --id=25C_Rep1 \--reference=${ref_path} \--libraries=${libraries_path}/25C_Rep1.csv \--localcores=${n_cores} \--localmem=${mem} \--jobmode=slurm# Sample 25°C Rep 2cellranger-arc count --id=25C_Rep2 \--reference=${ref_path} \--libraries=${libraries_path}/25C_Rep2.csv \--localcores=${n_cores} \--localmem=${mem} \--jobmode=slurm
02_aggr.sh
Code
#!/bin/bash# Aggregate together all samples -----------------------------------------------# Directions for running this script -------------------------------------------# 0. Before running run cellranger-arc count for all samples and create # aggr_libraries.csv file pointing to atac_fragments.tsv.gz and # gex_molecule_info.h5 files per 10X Genomics instructions.# 1. In the command line, run the following command: sbatch path/to/this/file.sh# Request cluster resources ----------------------------------------------------# Specify partition#SBATCH --partition=bluemoon# Request nodes#SBATCH --nodes=1#SBATCH --ntasks-per-node=1#SBATCH --cpus-per-task=12# Request memory for the entire job -- you can request --mem OR --mem-per-cpu#SBATCH --mem=31000# Reserve walltime -- hh:mm:ss -- 30 hrs max below#SBATCH --time=30:00:00# Name this job#SBATCH --job-name=aggr_all# Name output of this job using %x=job-name and %j=job-id#SBATCH --output=logs/%x_%j.out # Echo some useful and interesting information echo "Starting sbatch script at: `date`"echo " running host: ${SLURMD_NODENAME}"echo " assigned nodes: ${SLURM_JOB_NODELIST}"echo " partition used: ${SLURM_JOB_PARTITION}"echo " jobid: ${SLURM_JOBID}"# Set up directories and variables ---------------------------------------------# Move to the directory where files will be aggregatedcd /netfiles02/lockwood_lab/heater/data/processed/seq/# Point towards installed cellranger-arc programexport PATH=/netfiles02/lockwood_lab/cellranger-arc/cellranger-arc-2.0.2:$PATH# Set variables for the scriptref_path="/netfiles02/lockwood_lab/heater/data/raw/seq/ref/BDGP6_32_filtered"libraries_path="/netfiles02/lockwood_lab/heater/data/processed/seq/samples/aggr_libraries.csv"# Run cellranger-arc aggr to aggregate all samples together --------------------cellranger-arc aggr --id=all \--csv=${libraries_path} \--reference=${ref_path} \--normalize=depth
00_create_seurat_object.R
Code
# ------------------------------------------------------------------------------# Load data and set up Seurat object# May 11, 2023# TS O'Leary# ------------------------------------------------------------------------------# Description # Load in the RNA-Seq count data and the ATAC-Seq peak fragment data and save as# an rds file to be used for quality control and filtering.# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)library(AnnotationHub)# Load the RNA-Seq and ATAC-Seq data -------------------------------------------data_dir <- here::here("data/processed/seq/all/outs/")raw_feature_dir <-paste0(data_dir, "raw_feature_bc_matrix")frag_path <-paste0(data_dir, "atac_fragments.tsv.gz")counts <-Read10X(raw_feature_dir)# Get gene annotations for dm6 -------------------------------------------------BDGP6.32<-query(AnnotationHub(), c("EnsDb", "Drosophila melanogaster", "109"))[[1]]annotation <-GetGRangesFromEnsDb(BDGP6.32)# Meta data --------------------------------------------------------------------# Create meta data for all cells based on the GEM well suffix. From 10X Genomics# "This number, which indicates which GEM well the barcode sequence came from, # is called the GEM well suffix. The numbering of the GEM wells will reflect the # order that the GEM wells were provided in the Aggregation CSV."meta_data <-tibble(cellnames =colnames(counts[[1]])) |>mutate(orig.ident =str_extract(cellnames, "[1-4]")) |>full_join(tibble::tribble(~orig.ident, ~sample_name, ~acc_temp,"1", "18C_Rep1", "18°C","2", "18C_Rep2", "18°C","3", "25C_Rep1", "25°C","4", "25C_Rep2", "25°C" ), by ="orig.ident") |>column_to_rownames("cellnames")# Create a Seurat object containing the RNA datadat <-CreateSeuratObject(project ="heater",counts = counts$`Gene Expression`,assay ="RNA",names.delim ="-",meta.data = meta_data)# Create ATAC assay and add it to the objectdat[["ATAC"]] <-CreateChromatinAssay(counts = counts$Peaks,sep =c(":", "-"),fragments = frag_path,annotation = annotation,validate.fragments =FALSE)# Save raw Seurat objectsaveRDS(dat, here::here("data/processed/seurat_object/00_dat_raw.rds"))# Create Seurat object from the filtered matrix from 10X Cell Ranger ARC -------# Load filtered datafiltered_feature_dir <-paste0(data_dir, "filtered_feature_bc_matrix")counts <-Read10X(filtered_feature_dir)# Meta data for counts from filtered countsmeta_data <-tibble(cellnames =colnames(counts[[1]])) |>mutate(orig.ident =str_extract(cellnames, "[1-4]")) |>full_join(tibble::tribble(~orig.ident, ~sample_name, ~acc_temp,"1", "18C_Rep1", "18°C","2", "18C_Rep2", "18°C","3", "25C_Rep1", "25°C","4", "25C_Rep2", "25°C" ), by ="orig.ident") |>column_to_rownames("cellnames")# Create a Seurat object containing the RNA datadat <-CreateSeuratObject(project ="heater",counts = counts$`Gene Expression`,assay ="RNA",names.delim ="-",meta.data = meta_data)# Create ATAC assay and add it to the objectdat[["ATAC"]] <-CreateChromatinAssay(counts = counts$Peaks,sep =c(":", "-"),fragments = frag_path,annotation = annotation)# Save 10X Cell Ranger ARC filtered Seurat objectsaveRDS(dat, here::here("data/processed/seurat_object/00_dat_10x_cells.rds"))
01_quality_control_filtering.R
Code
# ------------------------------------------------------------------------------# Quality control and filtering of low-quality cells# March 27, 2023# TS O'Leary# ------------------------------------------------------------------------------# Description -----# Basic quality control metrics and filtering out non-cell barcodes from the 10X # Cell Ranger ARC called cells.# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)# Load 10X filtered datadat <-readRDS( here::here("data/processed/seurat_object/00_dat_10x_cells.rds"))# Calculate Nucleosome Signal and TSS Enrichment for ATAC QC metricsDefaultAssay(dat) <-"ATAC"dat <-NucleosomeSignal(dat)dat <-TSSEnrichment(dat)# Calculate the percentage of RNA reads that map to mitochondrial genes DefaultAssay(dat) <-"RNA"dat[["percent.mt"]] <-PercentageFeatureSet( dat,pattern ="^mt:")# Calculate the percentage of RNA reads that map to ribosomal genesdat[["percent.ribo"]] <-PercentageFeatureSet( data, pattern ="^Rp[S|L]")# Save object with Nucleosome, TSS, and mitochondria meta.data added saveRDS(dat, "data/processed/seurat_object/01_dat_10x_cells.rds")# Filter out barcodes with low counts or high mitochondrial content ------------# Define filtering thresholdslow_ATAC <-800low_RNA <-200max_mt <-5#### Calderon did 25 % for both mt and rb -- ask Seth? Maybe#max_rb <- 5# Subset 10x data based on thresholdsdat <- dat |>subset( nCount_RNA >= low_RNA & nCount_ATAC >= low_ATAC & percent.mt < max_mt # &# percent.ribo < max_rb )# Save objectsaveRDS(dat, "data/processed/seurat_object/01_dat_10x_filtered.rds")# Create a filtered fragment file to be used in the amulet doublet finder ------# Path to fragment file with all barcodes includeddata_dir <- here::here("data/processed/seq/all/outs/")frag_path <-paste0(data_dir, "atac_fragments.tsv.gz")# Write the file with fragments from only the filtered barcodesFilterCells(frag_path,Cells(dat),outfile =paste0(data_dir, "atac_fragments_cells.tsv.gz"))
02_initial_cluster.R
Code
# ------------------------------------------------------------------------------# Dimension reduction and clustering# May 11, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)library(clustree)# Load datadat <-readRDS( here::here("data/processed/seurat_object/01_dat_10x_filtered.rds"))# Run the standard workflow for visualization and clustering -------------------# This initial clustering is just for the multiplets plots# Set RNA to the default assay set for the following set of operations DefaultAssay(dat) <-"RNA"# Log-Normalize datadat <-NormalizeData(dat)# Find variable featuresdat <-FindVariableFeatures(dat)# Scale and center datadat <-ScaleData(dat)# Run principle component analysis dat <-RunPCA(dat)# Run UMAPdat <-RunUMAP(dat, dims =1:30)# Construct nearest-neighbor graphdat <-FindNeighbors(dat, dims =1:30)# Finding the correct resolution of cluster -----# Run on a bunch of different resolutionsdat <-FindClusters(dat, resolution =seq(0.01, 0.1, 0.01))# Look at the cluster treeclustree::clustree(dat)# Remove the RNA_snn columns addeddat@meta.data <- dat@meta.data |>select(!contains("RNA_snn"))# Set the final clustersdat <-FindClusters(dat, resolution =0.03)# Save datasaveRDS( dat, here::here("data/processed/seurat_object/02_dat_initial_cluster.rds"))
# ------------------------------------------------------------------------------# Remove multiplets# May 23, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librariesrequire(tidyverse)require(Seurat)# Load multiplet barcodesdoublet_amulet <-readRDS( here::here("data/processed/qc/doublet_amulet.rds") ) |>filter(q.value <0.05) |>rownames()doublet_finder <-readRDS(here::here("data/processed/qc/doublet_finder.rds"))# Load seurat objectdat <-readRDS( here::here("data/processed/seurat_object/02_dat_initial_cluster.rds")) # Mark amulet described multiplet barcodesdat$doublet_amulet <-ifelse(colnames(dat) %in% doublet_amulet, "Multiplet","Singlet")# Mark amulet described multiplet barcodesdat$doublet_finder <-ifelse(colnames(dat) %in% doublet_finder,"Multiplet","Singlet")# Save dat with multiplets markedsaveRDS(dat, here::here("data/processed/seurat_object/05_dat_multiplets.rds"))# Remove all multiplets from the data setdat <- dat |>subset(doublet_amulet =="Singlet"& doublet_finder =="Singlet")# There are still two outlier barcodes with exceptionally high nCount_RNA when# compared to the remaining high quality nuclei – 12-13k compared to 6-7k for # the next highest cells -- so let's remove those two outliers nowdat <- dat |>subset(nCount_RNA <10000)# Remove cluster info from that initial clusteringdat@meta.data <- dat@meta.data |>select(-seurat_clusters) |>select(!contains("snn"))# Save dat with multiplets removedsaveRDS(dat, here::here("data/processed/seurat_object/05_dat_filtered.rds"))
06_qc_data_cells.R
Code
# ------------------------------------------------------------------------------# Quality control and filtering of low-quality cells# May 25, 2023# TS O'Leary# ------------------------------------------------------------------------------# Description -----# Quality control metrics of final cells# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)library(AnnotationHub)# Load 10X filtered datadat <-readRDS( here::here("data/processed/seurat_object/05_dat_filtered.rds"))# Load path to fragment file for later countingdata_dir <- here::here("data/processed/seq/all/outs/")frag_path <-paste0(data_dir, "atac_fragments.tsv.gz")# Get gene annotations for dm6 -------------------------------------------------BDGP6.32<-query(AnnotationHub(), c("EnsDb", "Drosophila melanogaster", "109"))[[1]]annotation <-GetGRangesFromEnsDb(BDGP6.32)seqlevelsStyle(annotation) <-'UCSC'# Calculate additional QC metrics ----------------------------------------------# Calculate TSS Enrichment matrix for the cells that are leftDefaultAssay(dat) <-"ATAC"dat <-TSSEnrichment(dat, fast =FALSE)# Counting fraction of reads in peaks -----# Count fragmentstotal_fragments <-CountFragments(fragments = frag_path, cells =Cells(dat))# Add to metadatadat$fragments <- total_fragments[total_fragments$CB ==colnames(dat), "frequency_count"]# Peak calling -----------------------------------------------------------------# Installed macs2 using the R package Herper# Path to minicondapath_to_miniconda <-"/slipstream_old/home/thomasoleary/.local/share/r-miniconda"# Path to macs2 for CallPeaks functionmacs2_path <-paste0(path_to_miniconda, "/envs/PeakCalling_analysis/bin/macs2")# Call peakspeaks <-CallPeaks( dat, macs2.path = macs2_path,effective.genome.size =1.25e8)# Create the peak matrixpeak_matrix <-FeatureMatrix(fragments =Fragments(dat),features = peaks)# Create a new assay using the MACS2 peak set and add it to the Seurat objectdat[["peaks"]] <-CreateChromatinAssay(counts = peak_matrix,fragments = frag_path,annotation = annotation)# Calculate fraction of reads in peaksdat <-FRiP( dat,assay ="peaks", total.fragments ="fragments")# Fraction of reads in TSS region ----------------------------------------------# Get the TSS sitesTSS_sites <-GetTSSPositions( dat@assays$ATAC@annotation)# # Get the 2000 bp upstream and 200 bp downstream region with promoter# # This is how TSS was defined in Calderon et al. 2022promoter_region <-promoters(TSS_sites)dat$nCount_TSS <-CountsInRegion( dat, assay ="ATAC", regions = TSS_sites)dat$nCount_promoter <-CountsInRegion( dat, assay ="ATAC", regions = promoter_region)# Calculatedat$FRiT <- dat$nCount_TSS/dat$fragments# Save seurat object with added infosaveRDS(dat, here::here("data/processed/seurat_object/06_dat_qc.rds"))
07_cluster.R
Code
# ------------------------------------------------------------------------------# Dimension reduction and clustering# May 30, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)library(clustree)# Load datadat <-readRDS( here::here("data/processed/seurat_object/06_dat_qc.rds"))# Joint Clustering based on Signac tutorial ------------------------------------# https://stuartlab.org/signac/articles/pbmc_multiomic.html# RNA data processingDefaultAssay(dat) <-"RNA"dat <-SCTransform(dat)dat <-RunPCA(dat)# ATAC data processingDefaultAssay(dat) <-"peaks"dat <-FindTopFeatures(dat, min.cutoff =5)dat <-RunTFIDF(dat)dat <-RunSVD(dat)# The first lsi component often captures sequencing depth rather than biological # variation so we should not use this first component in further analysis.# See plot below for ~-1 corr with depth and the first componentDepthCor(dat)# Build a joint neighbor graph using both assaysdat <-FindMultiModalNeighbors(object = dat,reduction.list =list("pca", "lsi"), dims.list =list(1:50, 2:40),verbose =TRUE)# Build a joint UMAPdat <-RunUMAP(object = dat,nn.name ="weighted.nn",assay ="RNA",verbose =TRUE)# Build a joint t-SNEdat <-RunTSNE(object = dat,nn.name ="weighted.nn",assay ="RNA",verbose =TRUE)# Find Clusters based on the LSIdat <-FindClusters(dat, resolution =0.2,graph.name ="wknn", algorithm =3)# Save datasaveRDS(dat, here::here("data/processed/seurat_object/07_dat_cluster.rds"))################################################################################# To see how and if the dimensional reduction of only ATAC or RNA libraries # looks different -- save these data below# RNA-only dim-reduction -------------------------------------------------------# Load datadat <-readRDS( here::here("data/processed/seurat_object/07_dat_cluster.rds"))# RNA data processingDefaultAssay(dat) <-"RNA"dat <-SCTransform(dat)dat <-RunUMAP(RunPCA(dat), dims =c(1:50))# Save dat with RNA only UMAPsaveRDS(dat, here::here("data/processed/seurat_object/07_dat_rna_umap.rds"))# ATAC-only dim-reduction ------------------------------------------------------# Load datadat <-readRDS( here::here("data/processed/seurat_object/07_dat_cluster.rds"))# ATAC UMAP quickDefaultAssay(dat) <-"peaks"dat <- dat |>ScaleData() |>RunPCA() |>RunUMAP(dims =c(1:50))# Save dat with ATAC only UMAPsaveRDS(dat, here::here("data/processed/seurat_object/07_dat_atac_umap.rds"))
08_cluster_markers.R
Code
# ------------------------------------------------------------------------------# Differential expression and accessibility# June 13, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)# Load datadat <-readRDS(here::here("data/processed/seurat_object/07_dat_cluster.rds"))# Find markers of each cluster that are conserved between the two conditions ---# Create empty list to populate in the for loopmarkers <-list()# Loop through each cluster individuallyfor (i inlevels(dat)) { markers[[i]] <-FindConservedMarkers(dat, ident.1 = i,grouping.var ="acc_temp",only.pos =TRUE) |>rownames_to_column("gene")}# Combine markers for all clustersmarkers <-bind_rows(markers, .id ="cluster")# Number of markers per clustermarkers %>%filter(max_pval <0.05) |>group_by(cluster) |>tally()# Save markers objectsaveRDS(markers, here::here("data/processed/genes/markers.rds"))
09_cluster_annotation.R
Code
# ------------------------------------------------------------------------------# Annotating Seurat Clusters using Fisher's enrichment of BDGP in situ data# June 12, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librarieslibrary(tidyverse)# Load data --------------------------------------------------------------------# Berkeley Drosophila Genome Project in situ database# https://insitu.fruitfly.org/insitu-mysql-dump/insitu_annot.csv.gzinsitu_annot <-read_csv(here::here("data/raw/annot/insitu_annot.csv"))# Cluster annotations from Calderon data at 4 to 6 hours cluster_annot_calderon <-read_csv( here::here("data/raw/annot/calderon_cluster_annot.csv")) |>filter(`time window (modeled)`=="4-6") |> dplyr::rename(cluster =`cluster #`) |> dplyr::select(cluster, `selected annotation (manual)`) |>distinct(`selected annotation (manual)`)# Marker genes from the Seurat clusters -- all markers with pval < 0.05markers <-readRDS(here::here("data/processed/genes/markers.rds")) |>group_by(cluster) |>filter(max_pval <0.05)# Fisher's Exact Test for enrichment of cell-type specific marker genes --------# Reference panel of a specific level of the in situ dataref_panel <- insitu_annotcluster_annot <-NULLpb <- progress::progress_bar$new(total =length(unique(markers$cluster)) *length(unique(ref_panel$annot)))# Loop through each cluster present in the query markers datafor (cluster_i inunique(markers$cluster)) {# Unique markers for a cluster query_markers <- markers |>filter(cluster == cluster_i) |>ungroup() |>distinct(gene) |>deframe()# Loop through each annotation type in the referencefor (ref_annot_j inunique(ref_panel$annot)) {# Distinct reference markers for this annotation ref_markers <- ref_panel |>filter(annot == ref_annot_j) |>distinct(gene) |>deframe()# Collect numbers for the Fisher's exact test contingency table ------------# Number of markers in common n_common <-length(intersect(query_markers, ref_markers))# Number unique to the query n_query_unique <-length(setdiff(query_markers, ref_markers))# Number unique to the reference n_ref_unique <-length(setdiff(ref_markers, query_markers)) # Number remaining n_remain <-length(unique(c(unique(markers$gene), rownames(ref_panel)))) - n_common - n_query_unique - n_ref_unique# Proportion of query markers that are shared with the reference prop_overlap <- n_common/length(query_markers)# Contingency table con_table <-matrix(c(n_common, n_query_unique, n_ref_unique, n_remain), ncol =2)# Run the Fisher's exact test and save the p-value fet_pval <-fisher.test(con_table)$p.value# Store results df <-tibble(cluster = cluster_i,annot = ref_annot_j,prop_overlap = prop_overlap,pval = fet_pval )# Bind results together into a dataframe cluster_annot <-bind_rows(df, cluster_annot)# Print progress bar pb$tick() }}pb$terminate()# Adjusted pval for FDR correctioncluster_annot <- cluster_annot %>%mutate(padj =p.adjust(pval, method ="BH"))# Save the full annotation resultssaveRDS(cluster_annot, here::here("data/processed/annot/cluster_annot_all.rds"))# Manual annotations by TSO using the 11 annotations present in at 4 to 6 hours# of development in the Calderon datacluster_annot_manual <-c("0"="ectoderm primordium","1"="mesoderm primordium","2"="ventral nerve cord primordium","3"="endoderm primordium","4"="ectoderm primordium","5"="muscle system primordium","6"="tracheal primordium","7"="ventral nerve cord primordium","8"="plasmatocytes anlage","9"="amnioserosa","10"="unknown")# Add in the consensus resultscluster_annot_manual <- cluster_annot |>filter(padj <0.05) |>mutate(cluster =as.numeric(cluster)) |>group_by(cluster) |>slice_min(padj, n =3) |>summarise(top_3 =paste0(annot, collapse ="; ")) |>mutate(annot_manual = cluster_annot_manual) |>select(cluster, annot_manual, top_3)# Save the consensus resultssaveRDS( cluster_annot_manual, here::here("data/processed/annot/cluster_annot_manual.rds"))
10_link_peaks.R
Code
# ------------------------------------------------------------------------------# Link peaks to genes# June 06, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librarieslibrary(tidyverse)library(Seurat)library(Signac)library(BSgenome.Dmelanogaster.UCSC.dm6)# Load datadat <-readRDS(here::here("data/processed/seurat_object/07_dat_cluster.rds"))seqlevelsStyle(BSgenome.Dmelanogaster.UCSC.dm6) <-"NCBI"# Set assayDefaultAssay(dat) <-"peaks"# Compute the GC content for each peakdat <-RegionStats( dat, genome = BSgenome.Dmelanogaster.UCSC.dm6)# Link peaks to genesdat <-LinkPeaks(object = dat,peak.assay ="peaks",expression.assay ="SCT")# Save datasaveRDS(dat, here::here("data/processed/seurat_object/10_dat_linked.rds"))
11_pseudobulk_DESeq2.R
Code
# ------------------------------------------------------------------------------# Pseudobulk analysis# June 08, 2023# TS O'Leary# ------------------------------------------------------------------------------# Load librariesrequire(tidyverse)require(Seurat)require(Signac)require(DESeq2)# Load datadat <-readRDS( here::here("data/processed/seurat_object/06_dat_qc.rds"))# Aggregate samples data into counts matrixcounts <-AggregateExpression(dat, group.by ="sample_name",assays ="RNA",slot ="counts",return.seurat =FALSE)[[1]]# Create metadata for counts matrixmetadata <-tibble(samples =c("18C_Rep1","18C_Rep2","25C_Rep1","25C_Rep1"),acc_temp =as.factor(c("18C", "18C", "25C", "25C")))# Create DESeq Data Setdds <-DESeqDataSetFromMatrix(countData = counts,colData = metadata,design =~ acc_temp)# Filter out genes with low counts across the four sampleskeep_genes <-rowSums(counts(dds)) >=10dds <- dds[keep_genes,]# Run DESeq2dds <-DESeq(dds)# Generate results objectres <-results(dds, name ="acc_temp_25C_vs_18C")deg_res <- res |>as_tibble(rownames ="gene")degs <- res |>as_tibble(rownames ="gene") |> dplyr::filter(padj<0.05) |>arrange(padj)# Save resultssaveRDS(res, here::here("output/degs/pseudobulk_DESeq_res.rds"))saveRDS(degs, here::here("output/degs/pseudobulk_DESeq.rds"))write_csv(degs |>select(gene), here::here("output/degs/pseudobulk_DESeq_gene_names.csv"))
pheno.R
Code
# ------------------------------------------------------------------------------# Canton S acclimation phenotype figure# March 31, 2023# TS O'Leary# ------------------------------------------------------------------------------# Description -----# Canton S phenotype figure# Load librarieslibrary(tidyverse)# Load datadat <-read_csv(here::here("data/raw/pheno/embryo_acc_pheno.csv"))# Filter only Canton S datadat <- dat |>filter(Genotype =="CantonS") |>filter(Stage =="Early")# Count the number of eggs per acclimation treatmentdat |>group_by(Acclimation) |>summarise(total_eggs =sum(Number_Eggs))# Plot Canton S Survivaldat |>mutate(Acclimation =paste0(Acclimation, "°C")) |>ggplot(aes(x = Acclimation,y = Survival*100)) +geom_boxplot(width =0.3,color ="grey20",outlier.shape =NA) + ggbeeswarm::geom_beeswarm(size =3,cex =2,shape =21,fill ="grey50",color ="grey20",alpha =0.9) +xlab("Acclimation temperature") +scale_y_continuous(name ="Survival after acute heat shock",limits =c(0, 100),expand =expansion(mult =c(0, 0.05)),labels =function(x) paste0(x, "%")) + cowplot::theme_minimal_hgrid(font_family ="Myriad Pro")# Save two sizesggsave(here::here("output/figs/pheno/cantonS_survival.png"), height =16, width =24,units ="cm")ggsave(here::here("output/figs/pheno/cantonS_survival_small.png"), height =12, width =18,units ="cm")
Criteria for filtering all barcodes to high-quality nuclei
We used Cell Ranger ARC called cell barcodes – algorithm described here. 14,301 barcodes out of 2,215,183.
Because the Cell Ranger ARC cell calling algorithm is very permissive to barcodes with very low counts (i.e., a minimum of a single count in each library), barcodes were additionally filtered to a low count threshold in both the ATAC and RNA libraries based on the clearly defined population of cells in the RNA and ATAC count scatterplot. Additionally, barcodes with more than 5% of RNA reads mapping to genes on the mitochondrial genome were excluded. A total of 886 barcodes were filtered out for 13,145 left.
Multiplets were filtered out using two independent methods, relying on either the ATAC or RNA libraries to call multiplets. AMULET relies on the assumption that in snATAC-seq of diploids there should be at most two overlapping fragments with the same cell barcode. The presence of more than two overlapping fragments is a potential indication of a multiplet. Doublets were also identified using the RNA-seq libraries with DoubletFinder. After removing the 1,103 multiplet barcodes, there were still two barcodes that had substantially higher number of RNA reads after all filtering (i.e., n_Count_RNA around 12-13k when the next highest are around 6-7k). So I removed those two for a final number of 12,040 high quality nuceli barcodes.
1.1 Knee plot
Figure 1: Knee plot of the number of ATAC reads and RNA reads in the raw data.